import math
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
# Load the raw train/test splits from disk and preview the training frame.
train_path = 'train.csv'
test_path = 'test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_train.head()
The primary key must uniquely identify each row in our data frame
Duplicates were found after comparing the number of unique values with the number of counts within the UID column
All duplicate rows were dropped
UID is set as a unique index
# Verify UID uniqueness: compare the distinct UID count against total rows.
print('Train: The number of unique values {} vs the number of all values {}'.format(
    df_train['UID'].nunique(), df_train['UID'].count()))
print('Test: The number of unique values {} vs the number of all values {}'.format(
    df_test['UID'].nunique(), df_test['UID'].count()))
# Deduplicate on UID itself rather than the full row: full-row dedup would
# leave behind rows that share a UID but differ elsewhere, breaking the
# "UID is a unique index" invariant established below.
df_train.drop_duplicates(subset='UID', inplace=True)
df_test.drop_duplicates(subset='UID', inplace=True)
print('Check if duplicates are removed in Train and Test:')
print(df_train['UID'].nunique() == df_train['UID'].count())
print(df_test['UID'].nunique() == df_test['UID'].count())
# Promote the now-unique UID to the index of both frames.
df_train = df_train.set_index('UID')
df_train.head()
df_test = df_test.set_index('UID')
df_test.head()
# Summarize missing data: absolute NaN counts and percentages per column,
# restricted to columns that actually contain NaNs.
train_null_counts = df_train.isnull().sum()
test_null_counts = df_test.isnull().sum()
train_nulls = train_null_counts[train_null_counts > 0]
test_nulls = test_null_counts[test_null_counts > 0]
train_rows = df_train.shape[0]
test_rows = df_test.shape[0]
train_pct = round((train_nulls / train_rows) * 100, 2)
test_pct = round((test_nulls / test_rows) * 100, 2)
pd.DataFrame(list(zip(train_nulls, test_nulls, train_pct, test_pct)),
             columns=['train NaN', 'test NaN', 'train %', 'test %'],
             index=train_nulls.index)
Here we will drop the column, BLOCKID, due to having all NaN values
Since none of the columns containing NaN values are categorical, we can impute all missing values with the column mean.
# The column BLOCKID contains all NaN values for every row, so it carries no
# information and is dropped from both frames.
df_train.drop(columns=['BLOCKID'], inplace=True)
df_test.drop(columns=['BLOCKID'], inplace=True)
# Mean-impute every remaining column that still has NaNs. Direct assignment
# (instead of replace(..., inplace=True) on a column slice) avoids the
# pandas chained-assignment pitfall, where the write-back can silently fail.
for DF in [df_train, df_test]:
    for column in DF.columns:
        if DF[column].isnull().sum() > 0:
            DF[column] = DF[column].fillna(DF[column].mean())
# Confirm no NaNs remain in either frame (both sums should be 0).
haveNulls = df_train.isnull().sum() > 0
(haveNulls).sum()
haveNulls = df_test.isnull().sum() > 0
(haveNulls).sum()
# Rows with zero population are not meaningful locations; inspect how many
# there are, then remove them from both frames.
zero_pop_train = df_train['pop'] == 0
zero_pop_test = df_test['pop'] == 0
df_train[zero_pop_train].shape
df_test[zero_pop_test].shape
df_train.drop(index=df_train.index[zero_pop_train], inplace=True)
df_train.shape
df_test.drop(index=df_test.index[zero_pop_test], inplace=True)
df_test.shape
The columns primary and SUMLEVEL add no useful information, so both are dropped from the train and test sets.
# SUMLEVEL and primary carry no useful variance; drop them from both frames
# and confirm the resulting shapes and remaining columns.
for frame in (df_train, df_test):
    frame.drop(columns=['SUMLEVEL', 'primary'], inplace=True)
print(df_train.shape)
df_test.shape
df_train.columns
4.Perform debt analysis. You may take the following steps:
Explore the top 2,500 locations where the percentage of households with a second mortgage is the highest and percent ownership is above 10 percent. Visualize using geo-map. You may keep the upper limit for the percent of households with a second mortgage to 50 percent
Use the following bad debt equation: Bad Debt = P (Second Mortgage ∪ Home Equity Loan) = second_mortgage + home_equity - home_equity_second_mortgage c) Create pie charts to show overall debt and bad debt
Create Box and whisker plot and analyze the distribution for 2nd mortgage, home equity, good debt, and bad debt for different cities
Create a collated income distribution chart for family income, house hold income, and remaining income
# Bad debt via inclusion-exclusion:
#   bad_debt = second_mortgage + home_equity - home_equity_second_mortgage
df_train['bad_debt'] = df_train['second_mortgage'] + df_train['home_equity'] - df_train['home_equity_second_mortgage']
df_test['bad_debt'] = df_test['second_mortgage'] + df_test['home_equity'] - df_test['home_equity_second_mortgage']
# Good debt is the portion of overall debt that is not bad debt. Derive it
# for the test frame too, so both frames carry the same derived columns.
df_train['good_debt'] = df_train['debt'] - df_train['bad_debt']
df_test['good_debt'] = df_test['debt'] - df_test['bad_debt']
# Top 2,500 locations by second-mortgage share, restricted to >10% ownership
# and capped at a 50% second-mortgage share. Take an explicit copy so the
# later bin-column assignments do not trigger SettingWithCopyWarning.
df_data = df_train[(df_train['pct_own'] > 0.1) & (df_train['second_mortgage'] < 0.5)]
df_data = df_data.sort_values(by='second_mortgage', ascending=False)
top_household_2nd_mortgage = df_data[:2500].copy()
top_household_2nd_mortgage[['place', 'pct_own', 'second_mortgage']].head()
top_household_2nd_mortgage[['city', 'pop', 'pct_own', 'second_mortgage', 'bad_debt', 'good_debt', 'debt']].tail()
!python -m pip install plotly
# Geo scatter of the top 2,500 second-mortgage locations.
# Based on: https://plotly.com/python/scatter-plots-on-maps/
fig = go.Figure(data=go.Scattergeo(
    locationmode='USA-states',
    lat=top_household_2nd_mortgage['lat'],
    lon=top_household_2nd_mortgage['lng'],
    text=top_household_2nd_mortgage['place'],
    marker=dict(
        size=5,
        color='rgb(178,34,34)',
        opacity=0.5,
    ),
))
fig.update_layout(
    title_text='Top 2,500 locations - highest percentage with second mortgage and greater than 0.10 ownership',
    showlegend=False,
    geo=dict(
        scope='usa',
        resolution=50,
        showland=True,
        showlakes=True,
        showsubunits=True,
        showcountries=True,
        landcolor='rgb(204, 204, 204)',
        countrycolor='rgb(204, 204, 204)',
        lakecolor='rgb(173, 216, 230)',
        projection_type='albers usa',
        countrywidth=0.5,
        coastlinewidth=2,
        lonaxis=dict(
            showgrid=True,
            gridwidth=0.5,
            range=[-150.0, -65.0],
            dtick=5,
        ),
        # US latitudes span roughly 19N (Hawaii) to 71N (Alaska); the
        # original range of [45, 85] came from a world-map example and
        # pinned the grid range above almost all of the plotted points.
        lataxis=dict(
            showgrid=True,
            gridwidth=0.5,
            range=[15.0, 75.0],
            dtick=5,
        ),
    ),
)
fig.show()
# Bin bad debt and overall debt into labelled ranges for the pie charts.
top_household_2nd_mortgage['bins1'] = pd.cut(
    top_household_2nd_mortgage['bad_debt'],
    bins=[0, 0.1, 0.2, 0.4, 1],
    labels=["0%-10% with bad debt", "10%-20% with bad debt",
            "20%-40% with bad debt", "40% or more with bad debt"])
top_household_2nd_mortgage['bins2'] = pd.cut(
    top_household_2nd_mortgage['debt'],
    bins=[0, 0.4, 0.6, 0.8, 1.0],
    labels=['0%-40% with debt', '40%-60% with debt',
            '60%-80% with debt', '80%-100% with debt'])
df_bins1 = top_household_2nd_mortgage.groupby(['bins1']).size()
df_bins2 = top_household_2nd_mortgage.groupby(['bins2']).size()
# pd.cut leaves bad_debt == 0 unbinned (NaN) because the first interval is
# open on the left; fill those rows with the existing lowest-bin label.
# The original filled with '0%-10% bad debt', which is not one of the
# declared categories, so those rows became a spurious fifth label.
top_household_2nd_mortgage['bins1'] = top_household_2nd_mortgage['bins1'].fillna('0%-10% with bad debt')
top_household_2nd_mortgage[['city', 'zip_code', 'bad_debt', 'good_debt', 'debt', 'bins1', 'bins2']].head(10)
top_household_2nd_mortgage[['debt', 'good_debt', 'bad_debt']].mean()
# Side-by-side pie charts: share of the top 2,500 locations falling into each
# bad-debt bin (left) and each overall-debt bin (right).
sns.set()
plt.subplots(figsize=(16, 18))
plt.subplot(1, 2, 1)
# Pull selected slices slightly out of each pie for emphasis.
explode1 = (0.0, 0.0, 0.1, 0.1)
explode2 = (0.0, 0.0, 0.0, 0.1)
colors = ( "green", "orange", "yellow", "red")
plt.title("top 2500 household with bad debt")
df_bins1.plot.pie(explode=explode1, colors=colors, startangle=120,labels=None, autopct='%1.1f%%',legend=True)
plt.axis('equal')  # keep the pie circular
plt.ylabel("bad debt")
plt.subplot(1, 2, 2)
plt.title("top 2500 household with overall debt")
df_bins2.plot.pie(explode=explode2,startangle=120, autopct='%1.1f%%', labels=None, legend=True)
plt.axis('equal')
plt.ylabel("overall debt")
plt.show()
# Pick the 12 cities with the most rows so each box plot is backed by a
# reasonable per-city sample size.
df = (df_train.groupby("city")["debt"]).agg(['count']).sort_values("count", ascending=False)
df.head()
df_city = df.head(12)
df_city.index
cities = list(df_city.index)
cities
# Collect the rows for those cities. DataFrame.append was removed in
# pandas 2.0 (and was O(n^2) when called in a loop); gather the per-city
# frames in a list and concatenate once instead.
frames = []
for city in cities:
    print(city)
    frames.append(df_train[df_train['city'] == city])
df1 = pd.concat(frames)
# Distribution of 2nd mortgage, home equity, good debt, and bad debt by city.
plt.figure(figsize=(16, 8))
sns.boxplot(data=df1, palette="bright", x='city', y='second_mortgage')
plt.figure(figsize=(16, 8))
sns.boxplot(data=df1, palette="bright", x='city', y='home_equity')
plt.figure(figsize=(16, 8))
sns.boxplot(data=df1, palette="bright", x='city', y='good_debt')
plt.figure(figsize=(16, 8))
sns.boxplot(data=df1, palette="bright", x='city', y='bad_debt')
# Remaining income = mean family income minus mean household income.
df_train['remaining_income'] = df_train['family_mean'] - df_train['hi_mean']
df_train[['city', 'family_mean', 'hi_mean', 'remaining_income']].head(10)
# Histograms of the three income measures side by side.
plt.subplots(figsize=(16, 5))
for position, column in enumerate(['hi_mean', 'family_mean', 'remaining_income'], start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df_train[column])
plt.show()
# Collated KDE of all three income measures on one chart.
plt.figure(figsize=(25, 15))
sns.displot(data=df_train[['hi_mean', 'family_mean', 'remaining_income']], kind="kde", height=6, aspect=1.8)
plt.title('collated income distribution chart')
plt.xlabel("Income")
plt.show()
Perform EDA and come out with insights into population density and age. You may have to derive new fields (make sure to weight averages for accurate measurements):
Use pop and ALand variables to create a new field called population density
Use male_age_median, female_age_median, male_pop, and female_pop to create a new field called median age c) Visualize the findings using appropriate chart type
Create bins for population into a new variable by selecting appropriate class interval so that the number of categories don’t exceed 5 for the ease of analysis.
Please detail your observations for rent as a percentage of income at an overall level, and for different states.
Perform correlation analysis for all the relevant variables by creating a heatmap. Describe your findings.
# Population density = population / land area.
df_train['pop_density'] = df_train['pop'] / df_train['ALand']
df_test['pop_density'] = df_test['pop'] / df_test['ALand']


def _weighted_median_age(frame):
    """Population-weighted blend of the male/female median ages.

    A weighted average must divide by the sum of the weights actually used
    (male_pop + female_pop). The original divided by the overall 'pop'
    column, which skews the result wherever pop != male_pop + female_pop.
    """
    weighted = (frame['male_age_median'] * frame['male_pop']
                + frame['female_age_median'] * frame['female_pop'])
    return weighted / (frame['male_pop'] + frame['female_pop'])


df_train['age_median'] = _weighted_median_age(df_train)
df_test['age_median'] = _weighted_median_age(df_test)
df_train[['city', 'state', 'male_age_median', 'female_age_median', 'age_median', 'male_pop', 'female_pop', 'pop', 'pop_density']].head()
# Histogram and box plot of the derived median-age field.
plt.subplots(figsize=(18, 7))
plt.subplot(1, 2, 1)
sns.histplot(data=df_train['age_median'])
plt.title('Median Age')
plt.subplot(1, 2, 2)
sns.boxplot(x=df_train['age_median'])
plt.title('Median Age')
# The original ended with a bare `plt.show` (no parentheses), which is a
# no-op expression; call it so the figure renders outside inline backends.
plt.show()
# Work on an explicit copy so adding the 'age group' column cannot trigger
# SettingWithCopyWarning (the column-slice would otherwise be a view of
# df_train).
df_population = df_train[['state', 'pop', 'pop_density', 'male_pop', 'female_pop', 'rent_mean', 'family_mean', 'age_median', 'married', 'separated', 'divorced']].copy()
# Five age bands keep the category count within the requested limit of 5.
bins = [0, 17, 30, 45, 65, 100]
age_group = ['youth', 'adult', 'middle', 'older', 'senior']  # removed stray leading space in 'adult'
df_population['age group'] = pd.cut(df_population['age_median'], bins=bins, labels=age_group)
df_population.head()
df_population['age_median'].isnull().sum()
df_train['age_median'].describe()
sns.catplot(x="age group", kind="count", height=6, aspect=1.4, data=df_population)
df_population.head()
# Box plots of raw population and population density to surface outliers.
sns.set()
plt.subplots(figsize=(10, 12))
plt.subplot(2, 1, 1)
sns.boxplot(x=df_population['pop'])
plt.title('population')
plt.subplot(2, 1, 2)
sns.boxplot(x=df_population['pop_density'])
plt.title('population density')
# The original had a bare `plt.show` (missing call); invoke it so the
# figure actually renders.
plt.show()
# State-level means of the population columns. Pass the string 'mean'
# rather than np.mean: pandas deprecated handing the NumPy callable to
# aggfunc in favor of the string alias.
table = pd.pivot_table(data=df_population, index=['state'], aggfunc='mean')
table.head()
# Scatter of marital-status shares against median age, one series per
# status. (The original passed palette= without hue=, which seaborn
# ignores with a warning, so it is dropped here.)
plt.figure(figsize=(12, 7))
sns.scatterplot(x='married', y='age_median', data=table)
sns.scatterplot(x='divorced', y='age_median', data=table)
sns.scatterplot(x='separated', y='age_median', data=table)
plt.xlabel("Status", fontsize=15)
plt.ylabel("Median Age", fontsize=15)
plt.legend(labels=['married', 'divorced', 'separated'])
plt.show()
# Per-state share of married people, split by age group. The 52 states are
# spread across two stacked panels so the x labels stay readable.
states = list(table.index)
sns.set(font_scale = 4)
plt.subplots(figsize=(70, 60))
plt.subplot(2, 1, 1)
sns.barplot(x = "state", y = 'married', hue = 'age group', order=states[0:26], palette="bright", data=df_population)
# Assigning plt.xticks to a throwaway name suppresses the notebook echo.
x = plt.xticks(rotation=90)
plt.subplot(2, 1, 2)
sns.set(font_scale = 3)
sns.barplot(x = "state", y = 'married', hue = 'age group', order=states[26:52], palette="bright", data=df_population)
x = plt.xticks(rotation=90)
In certain states like Delaware, North Carolina and South Carolina, 80 percent or more of seniors remain married. These states are ideal locations for couples after retirement. Each state shows a consistent share of middle-aged couples surpassing 50 percent and older couples surpassing 60 percent.
# Per-state share of divorced people by age group, same two-panel layout.
plt.subplots(figsize=(50, 60))
plt.subplot(2, 1, 1)
sns.barplot(x = "state", y = 'divorced', hue = 'age group',order=states[0:26], palette="bright", data=df_population)
sns.set(font_scale = 3)
x = plt.xticks(rotation=90)
plt.subplot(2, 1, 2)
sns.barplot(x = "state", y = 'divorced', hue = 'age group',order=states[26:52], palette="bright", data=df_population)
sns.set(font_scale = 3)
x = plt.xticks(rotation=90)
# Per-state share of separated people by age group, same two-panel layout.
plt.subplots(figsize=(50, 60))
plt.subplot(2, 1, 1)
sns.barplot(x = "state", y = 'separated', hue = 'age group',order=states[0:26], palette="bright", data=df_population)
sns.set(font_scale = 3)
x = plt.xticks(rotation=90)
plt.subplot(2, 1, 2)
sns.barplot(x = "state", y = 'separated', hue = 'age group',order=states[26:52], palette="bright", data=df_population)
sns.set(font_scale = 3)
x = plt.xticks(rotation=90)
# Rent as a share of income: mean rent divided by mean family income.
# NOTE(review): rent_mean looks like a monthly figure while family_mean
# looks annual - confirm the units before reading this as a true
# rent-to-income percentage.
df_train['rent_to_income'] = df_train['rent_mean'] / df_train['family_mean']
rent_to_income = df_train['rent_to_income'].mean() * 100
rent_to_income
# The same ratio aggregated per state, as a rounded percentage.
grouped = df_train.groupby(by='state')['rent_to_income'].agg(['mean'])
rent_to_income_state = round(grouped['mean'] * 100, 2)
rent_to_income_state
# Correlation heatmap across the analysis-relevant columns.
heatmap_columns = [
    'COUNTYID', 'STATEID', 'zip_code', 'area_code', 'type', 'pop',
    'pop_density', 'rent_mean', 'rent_gt_10', 'rent_gt_15', 'hi_mean',
    'family_mean', 'hc_mean', 'second_mortgage', 'hc_mortgage_mean',
    'hc_mortgage_median', 'hc_mortgage_samples', 'home_equity', 'debt',
    'bad_debt', 'hs_degree', 'age_median', 'pct_own', 'married',
    'separated', 'divorced',
]
cor = df_train[heatmap_columns].corr()
sns.set()
plt.figure(figsize=(20, 15))
sns.heatmap(cor, annot=True)
plt.show()
interesting find:
The economic multivariate data has a significant number of measured variables. The goal is to find whether the measured variables depend on a smaller number of unobserved common factors, or latent variables. Each measured variable is assumed to depend on a linear combination of the common factors, and the coefficients are known as loadings. Each measured variable also includes a component due to independent random variability, known as "specific variance" because it is specific to one variable. Obtain the common factors and then plot the loadings. Use factor analysis to find latent variables in our dataset and gain insight into the linear relationships in the data. The following is the list of latent variables:
Highschool graduation rates
Median population age
Second mortgage statistics
Percent own
Bad debt expense
!pip install factor_analyzer
from factor_analyzer import FactorAnalyzer
import warnings
warnings.filterwarnings('ignore')
# Fit an exploratory 5-factor model on the float columns and inspect the
# eigenvalues via a scree plot to judge how many factors to retain.
df = df_train.select_dtypes(include='float')
fa = FactorAnalyzer(n_factors=5, rotation='varimax')
fa.fit(df)
ev, v = fa.get_eigenvalues()
ev
np.set_printoptions(suppress=True)
sns.set_theme(style="whitegrid")
plt.figure(figsize=(12, 9))
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.show()
# Refit with 12 factors (suggested by the scree plot) and visualize the
# loadings as a heatmap.
fa = FactorAnalyzer(n_factors=12, rotation='varimax')
fa.fit(df)
np.set_printoptions(suppress=True)
factor_names = [f'factor{i}' for i in range(1, 13)]
loadings = pd.DataFrame(fa.loadings_, columns=factor_names, index=df.columns)
sns.set()
plt.figure(figsize=(20, 17))
sns.heatmap(loadings, annot=True)
plt.show()
loadings.index
# factor 1 - 'rent_mean', 'rent_median', 'rent_stdev', 'hi_mean', 'hi_median', 'hi_stdev',
# 'family_mean', 'family_median', 'family_stdev'
# 'hc_mortgage_mean','hc_mortgage_median', 'hc_mortgage_stdev',
# 'hc_mean', 'hc_median', 'hc_stdev', 'divorced'
# factor 2 - 'hi_sample_weight','hi_samples', 'family_sample_weight', 'family_samples', mortgage_samples
# factor 3 - 'rent_gt_15' - 'rent_gt_50'
# factor 4 - 'age_median'
# factor 5 - 'hs_degree', 'hs_degree_male', 'hs_degree_female'
# factor 6 - 'pct_own', 'married'
# factor 7 - 'debt'
# factor 8 - 'home_equity_second_mortgage', 'second_mortgage'
# factor 9 - 'home_equity', 'bad_debt'
# factor 10 - 'male_age_stdev', 'female_age_stdev'
# factor 11 - 'rent_to_income'
# factor 12 - none
Build a linear Regression model to predict the total monthly expenditure for home mortgages loan. Please refer ‘deplotment_RE.xlsx’. Column hc_mortgage_mean is predicted variable. This is the mean monthly mortgage and owner costs of specified geographical location. Note: Exclude loans from prediction model which have NaN (Not a Number) values for hc_mortgage_mean.
a) Run a model at a Nation level. If the accuracy levels and R square are not satisfactory proceed to below step.
b) Run another model at State level. There are 52 states in USA.
c) Keep below considerations while building a linear regression model. Data Modeling :
Variables should have significant impact on predicting Monthly mortgage and owner costs
Utilize all predictor variable to start with initial hypothesis
R square of 60 percent and above should be achieved
Ensure Multi-collinearity does not exist in dependent variables
Test if predicted variable is normally distributed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_columns', None)
# Encode the categorical 'type' column as integer codes for modeling.
# Derive the category set from the TRAINING data and apply the same dtype
# to both frames: encoding train and test independently (the original
# approach) can map the same label to different codes whenever the two
# splits contain different category sets. Unseen test labels become -1.
type_dtype = pd.CategoricalDtype(categories=sorted(df_train['type'].unique()))
df_train['type'] = df_train['type'].astype(type_dtype).cat.codes
df_train['type'].unique()
df_test['type'] = df_test['type'].astype(type_dtype).cat.codes
df_test['type'].unique()
# Initial hypothesis: use every candidate predictor. The target,
# hc_mortgage_mean, is the mean monthly mortgage and owner costs.
features = [
    'STATEID', 'zip_code', 'type', 'lat', 'lng', 'pop_density',
    'rent_mean', 'rent_median', 'rent_stdev', 'hi_mean', 'hi_median',
    'hi_stdev', 'hi_sample_weight', 'hi_samples', 'family_mean',
    'family_median', 'family_stdev', 'hc_mean', 'hc_median', 'hc_stdev',
    'home_equity_second_mortgage', 'second_mortgage', 'home_equity',
    'debt', 'hs_degree', 'pct_own', 'married', 'divorced', 'age_median',
]
target = ['hc_mortgage_mean']
Xtrain = df_train[features]
ytrain = df_train[target]
Xtest = df_test[features]
ytest = df_test[target]
for split in (Xtrain, ytrain, Xtest, ytest):
    print(split.shape)
Multicollinearity affects the coefficients and p-values, but it does not influence the predictions, precision of the predictions, and the goodness-of-fit statistics. If your primary goal is to make predictions, and you don’t need to understand the role of each independent variable, you don’t need to reduce severe multicollinearity.
The goal is to keep the scores around 5, but we will use a cutoff score of 8 or above; however, a score above the cutoff does not mean the feature will automatically be dropped.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance-inflation factor per predictor; a constant column is added so the
# scores are measured against an intercept-bearing design matrix.
x_temp = sm.add_constant(Xtrain)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x_temp.values, idx)
                     for idx in range(x_temp.values.shape[1])]
vif["features"] = x_temp.columns
print(vif.round(1))
From the output of the VIF scores, some variables show a severely high score, will need to exclude some features.
drop rent_median, hi_median, hi_stdev, hi_sample_weight, hi_samples, family_median, family_stdev, and hc_median (rent_stdev and hc_stdev remain in the reduced feature list below)
# Reduced predictor set after removing the worst multicollinearity
# offenders flagged by the first VIF pass; recompute VIF to confirm.
features = [
    'STATEID', 'type', 'zip_code', 'lat', 'lng', 'pop_density',
    'rent_mean', 'rent_stdev', 'hc_mean', 'hc_stdev', 'family_mean',
    'second_mortgage', 'home_equity', 'debt', 'hs_degree', 'pct_own',
    'married', 'divorced', 'age_median',
]
target = ['hc_mortgage_mean']
Xtrain = df_train[features]
ytrain = df_train[target]
Xtest = df_test[features]
ytest = df_test[target]
for split in (Xtrain, ytrain, Xtest, ytest):
    print(split.shape)
x_temp = sm.add_constant(Xtrain)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x_temp.values, idx)
                     for idx in range(x_temp.values.shape[1])]
vif["features"] = x_temp.columns
print(vif.round(1))
Run a model at a Nation level
# Standardize features: fit the scaler on the training data only, then apply
# the identical transform to the test split to avoid leakage.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtrain.std(axis=0)
np.set_printoptions(suppress=False)
Xtrain.mean(axis=0)
Xtest = scaler.transform(Xtest)
Xtrain
# Fit the nation-level linear regression. The `normalize` argument was
# removed from LinearRegression in scikit-learn 1.2; its old default of
# False matches this call, so dropping it preserves behavior (and matches
# the state-level fit later in this notebook, which already omits it).
linear_regression = linear_model.LinearRegression(fit_intercept=True, n_jobs=1)
linear_regression.fit(Xtrain, ytrain)
linear_regression.coef_
linear_regression.intercept_
# Evaluate on the held-out test split: MSE, RMSE, then R^2.
y_predict = linear_regression.predict(Xtest)
mse = mean_squared_error(ytest, y_predict)
mse
rmse = math.sqrt(mse)
rmse
# R^2 on the test data, via the estimator and via the metric function.
linear_regression.score(X=Xtest, y=ytest)
from sklearn.metrics import r2_score
r2_score(ytest, y_predict)
sns.set()
plt.figure(figsize=(10, 8))
# sns.distplot was deprecated and then removed from seaborn; histplot with
# a KDE overlay is the supported equivalent and matches the histplot calls
# used earlier in this notebook.
sns.histplot(y_predict, kde=True)
plt.xlabel("y_predict distribution")
plt.show()
# Line up predictions with actuals for a quick side-by-side sanity check.
Predicted = pd.DataFrame(y_predict, columns=["predicted"])
results = pd.concat([(ytest.reset_index(drop=True)), Predicted], axis=1)
df1 = results[0:100]
df1.head(10)
# State-level model: restrict both splits to Texas and rebuild the
# feature/target matrices with the reduced predictor list.
features = [
    'STATEID', 'type', 'zip_code', 'lat', 'lng', 'pop_density',
    'rent_mean', 'rent_stdev', 'hc_mean', 'hc_stdev', 'family_mean',
    'second_mortgage', 'home_equity', 'debt', 'hs_degree', 'pct_own',
    'married', 'divorced', 'age_median',
]
target = ['hc_mortgage_mean']
df_state_train = df_train[df_train['state'] == 'Texas']
df_state_test = df_test[df_test['state'] == 'Texas']
Xtrain = df_state_train[features]
ytrain = df_state_train[target]
Xtest = df_state_test[features]
ytest = df_state_test[target]
for split in (Xtrain, ytrain, Xtest, ytest):
    print(split.shape)
# Standardize (fit on the Texas training rows only) and refit the linear
# regression at the state level, reporting MSE, RMSE, and test R^2.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)
state_model = linear_model.LinearRegression(fit_intercept=True, n_jobs=1)
state_model.fit(Xtrain, ytrain)
state_model.coef_
state_model.intercept_
y_predict = state_model.predict(Xtest)
mse = mean_squared_error(ytest, y_predict)
mse
rmse = math.sqrt(mse)
rmse
state_model.score(X=Xtest, y=ytest)
Create a dashboard in tableau by choosing appropriate chart types and metrics useful for the business. The dashboard must entail the following:
Box plot of distribution of average rent by type of place (village, urban, town, etc.).
Pie charts to show overall debt and bad debt.
Explore the top 2,500 locations where the percentage of households with a second mortgage is the highest and percent ownership is above 10 percent. Visualize using geo-map.
Heat map for correlation matrix.
Pie chart to show the population distribution across different types of places (village, urban, town etc.)
Export cleaned and preprocessed datas for tableau display
# Persist the cleaned frames for the Tableau dashboard (UID index included).
exports = {'df_train.csv': df_train,
           'top_household_2nd_mortgage.csv': top_household_2nd_mortgage}
for filename, frame in exports.items():
    frame.to_csv(filename)